Problem 1

# Embed the hand-written solution image for Problem 1.
knitr::include_graphics("q1.jpeg")

$$\begin{align} &g = \arg\min_g\left(\sum_{i=1}^{n}(y_i - g(x_i))^2 + \lambda\int[g^{(m)}(x)]^2\,dx\right) \\\\ &\text{a). } \lambda = \infty,\ m = 0 \\ &g = \arg\min_g\left(\sum_{i=1}^{n}(y_i - g(x_i))^2 + \infty\int[g(x)]^2\,dx\right) \\ &\text{The penalty dominates when } \lambda = \infty. \text{ When } m = 0,\ g^{(0)} = g, \\ &\text{so to minimize the criterion we must set } g(x) = 0. \\\\ &\text{b). } \lambda = \infty,\ m = 1 \\ &g = \arg\min_g\left(\sum_{i=1}^{n}(y_i - g(x_i))^2 + \infty\int[g'(x)]^2\,dx\right) \\ &\text{The penalty dominates when } \lambda = \infty. \text{ When } m = 1,\ g^{(1)} = g', \\ &\text{so we must set } g'(x) = 0, \text{ i.e. } g(x) \text{ is a constant.} \\\\ &\text{c). } \lambda = \infty,\ m = 2 \\ &g = \arg\min_g\left(\sum_{i=1}^{n}(y_i - g(x_i))^2 + \infty\int[g''(x)]^2\,dx\right) \\ &\text{The penalty dominates when } \lambda = \infty. \text{ When } m = 2,\ g^{(2)} = g'', \\ &\text{so we must set } g''(x) = 0, \text{ i.e. } g(x) \text{ is linear: } g(x) = ax + b. \\\\ &\text{d). } \lambda = \infty,\ m = 3 \\ &g = \arg\min_g\left(\sum_{i=1}^{n}(y_i - g(x_i))^2 + \infty\int[g'''(x)]^2\,dx\right) \\ &\text{The penalty dominates when } \lambda = \infty. \text{ When } m = 3,\ g^{(3)} = g''', \\ &\text{so we must set } g'''(x) = 0, \text{ i.e. } g(x) \text{ is quadratic: } g(x) = ax^2 + bx + c. \\\\ &\text{e). } \lambda = 0,\ m = 3 \\ &g = \arg\min_g\left(\sum_{i=1}^{n}(y_i - g(x_i))^2 + 0\cdot\int[g'''(x)]^2\,dx\right) \\ &\text{The penalty vanishes when } \lambda = 0, \text{ so } g \text{ can interpolate all training points and achieve RSS} = 0. \end{align}$$

Problem 2

$$\begin{align} &\hat{f}(X) = 2 + 3I(0 \leq X \leq 2) - 3(X+1)I(1 \leq X \leq 2) - 2(2X-2)I(3 \leq X \leq 4) + 2I(4 < X \leq 5) \\\\ &\text{When } -2 \leq X < 0: \hat{f}(X) = 2 - 3 \cdot 0 - 3(X+1) \cdot 0 - 2(2X-2) \cdot 0 + 2 \cdot 0 = 2 \\ &\text{No indicator is active; the slope is 0; the intercept is 2.} \\\\ &\text{When } 0 \leq X \leq 1: \hat{f}(X) = 2 + 3 \cdot 1 - 3(X+1) \cdot 0 - 2(2X-2) \cdot 0 + 2 \cdot 0 = 5 \\ &\text{Only } I(0 \leq X \leq 2) \text{ is active; the slope is 0; the intercept is 5.} \\\\ &\text{When } 1 < X \leq 2: \hat{f}(X) = 2 + 3 \cdot 1 - 3(X+1) \cdot 1 - 2(2X-2) \cdot 0 + 2 \cdot 0 = -3X + 2 \\ &\text{Both } I(0 \leq X \leq 2) \text{ and } I(1 \leq X \leq 2) \text{ are active; the slope is } -3\text{; the intercept is 2.} \\\\ &\text{When } 2 < X < 3: \hat{f}(X) = 2 - 3 \cdot 0 - 3(X+1) \cdot 0 - 2(2X-2) \cdot 0 + 2 \cdot 0 = 2 \\ &\text{No indicator is active; the slope is 0; the intercept is 2.} \\\\ &\text{When } 3 \leq X \leq 4: \hat{f}(X) = 2 - 3 \cdot 0 - 3(X+1) \cdot 0 - 2(2X-2) \cdot 1 + 2 \cdot 0 = -4X + 6 \\ &\text{Only } I(3 \leq X \leq 4) \text{ is active; the slope is } -4\text{; the intercept is 6.} \\\\ &\text{When } 4 < X \leq 5: \hat{f}(X) = 2 - 3 \cdot 0 - 3(X+1) \cdot 0 - 2(2X-2) \cdot 0 + 2 \cdot 1 = 4 \\ &\text{Only } I(4 < X \leq 5) \text{ is active; the slope is 0; the intercept is 4.} \\\\ &\text{When } 5 < X \leq 6: \hat{f}(X) = 2 - 3 \cdot 0 - 3(X+1) \cdot 0 - 2(2X-2) \cdot 0 + 2 \cdot 0 = 2 \\ &\text{No indicator is active; the slope is 0; the intercept is 2.} \end{align}$$

# Embed the hand-drawn plot of the fitted step function for Problem 2.
knitr::include_graphics("q2.jpeg")

Problem 3

$$\begin{align} &f(X) = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3 + \beta_4(X - \psi)_+^3 \\\\ &\text{a).} \\ &\text{For all } X > \psi: (X - \psi)_+^3 = (X - \psi)^3, \text{ so} \\ &f(X) = f_1(X) = (\beta_0 - \beta_4\psi^3) + (\beta_1 + 3\psi^2\beta_4)X + (\beta_2 - 3\beta_4\psi)X^2 + (\beta_3 + \beta_4)X^3, \\ &\text{and } f(X) \text{ is a cubic polynomial.} \\\\ &\text{b).} \\ &\text{For all } X \leq \psi: (X - \psi)_+^3 = 0, \text{ so} \\ &f(X) = f_2(X) = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3 + \beta_4 \cdot 0 = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3, \\ &\text{and } f(X) \text{ is a cubic polynomial.} \\\\ &\text{c).} \\ &\text{To show } f(X) \text{ is continuous at } X = \psi: \\ &f_1(\psi) = (\beta_0 - \beta_4\psi^3) + (\beta_1 + 3\psi^2\beta_4)\psi + (\beta_2 - 3\beta_4\psi)\psi^2 + (\beta_3 + \beta_4)\psi^3 \\ &\phantom{f_1(\psi)} = \beta_0 - \beta_4\psi^3 + \beta_1\psi + 3\beta_4\psi^3 + \beta_2\psi^2 - 3\beta_4\psi^3 + \beta_3\psi^3 + \beta_4\psi^3 \\ &\phantom{f_1(\psi)} = \beta_0 + \beta_1\psi + \beta_2\psi^2 + \beta_3\psi^3 = f_2(\psi), \\ &\text{so } f(X) \text{ is continuous at } X = \psi. \\\\ &\text{d).} \\ &\text{To show } f'(X) \text{ is continuous at } X = \psi: \\ &f_1'(\psi) = \beta_1 + 2\beta_2\psi + 3\beta_3\psi^2 = f_2'(\psi), \\ &\text{so } f'(X) \text{ is continuous at } X = \psi. \\\\ &\text{e).} \\ &\text{To show } f''(X) \text{ is continuous at } X = \psi: \\ &f_1''(\psi) = 2\beta_2 + 6\beta_3\psi = f_2''(\psi), \\ &\text{so } f''(X) \text{ is continuous at } X = \psi. \\\\ &\text{Thus } f(X) \text{ is a cubic spline regardless of the values of the } \beta\text{'s.} \end{align}$$

Problem 4.

# Load the ISLR2 package, which provides the Wage data set.
library(ISLR2)
# NOTE(review): attach() is generally discouraged because it can silently mask
# other objects; it is kept here because later calls reference `age` and
# `wage` directly.
attach(Wage)
# Preview the first few rows of the Wage data.
head(Wage)
##        year age           maritl     race       education             region
## 231655 2006  18 1. Never Married 1. White    1. < HS Grad 2. Middle Atlantic
## 86582  2004  24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003  45       2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003  43       2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443  2005  50      4. Divorced 1. White      2. HS Grad 2. Middle Atlantic
## 376662 2008  54       2. Married 1. White 4. College Grad 2. Middle Atlantic
##              jobclass         health health_ins  logwage      wage
## 231655  1. Industrial      1. <=Good      2. No 4.318063  75.04315
## 86582  2. Information 2. >=Very Good      2. No 4.255273  70.47602
## 161300  1. Industrial      1. <=Good     1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good     1. Yes 5.041393 154.68529
## 11443  2. Information      1. <=Good     1. Yes 4.318063  75.04315
## 376662 2. Information 2. >=Very Good     1. Yes 4.845098 127.11574
# Reproducible 50/50 train-test split of the Wage data.
set.seed(2)
train <- sample(seq_len(nrow(Wage)), nrow(Wage) / 2)
# Training and held-out halves, indexed by the sampled rows.
Wage.train <- Wage[train, ]
Wage.test <- Wage[-train, ]
# a). polynomial regression: choose the degree by 10-fold cross-validation.
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:mosaic':
## 
##     logit
## The following object is masked from 'package:lattice':
## 
##     melanoma
set.seed(1)
# 10-fold CV error for polynomial degrees 1 through 10.
# NOTE(review): these CV fits use the full Wage data, not only the training
# half created above -- confirm this is intended.
mse_poly <- rep(0, 10)
for (i in c(1:10)){
  glm.fit <- glm(wage ~ poly(age, i), data = Wage)
  mse_poly[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}

# Degree with the smallest estimated CV error.
bestdeg<- which.min(mse_poly) 
print(paste("Best degree using 10-fold CV is:", bestdeg))
## [1] "Best degree using 10-fold CV is: 9"
# CV-error curve across polynomial degrees.
plot(x = c(1:10), y = mse_poly, type = "l",
     xlab = "Degree of Poly",
     ylab = "10-fold CV MSE", main = "10-fold CV MSE for polynomial model with different degrees")
points(x = c(1:10), y = mse_poly)

library(repr)
# Fit the CV-selected degree-9 polynomial on the training half only.
fit <- lm(wage ~ poly(age, 9), data=Wage, subset = train)
round(coef(summary(fit)), 3)
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)    112.247      1.034 108.506    0.000
## poly(age, 9)1  500.884     56.741   8.828    0.000
## poly(age, 9)2 -422.836     57.265  -7.384    0.000
## poly(age, 9)3  106.645     56.181   1.898    0.058
## poly(age, 9)4 -106.821     58.092  -1.839    0.066
## poly(age, 9)5    6.480     57.963   0.112    0.911
## poly(age, 9)6   66.865     57.633   1.160    0.246
## poly(age, 9)7  -13.766     57.853  -0.238    0.812
## poly(age, 9)8  -47.206     57.336  -0.823    0.410
## poly(age, 9)9  -95.484     56.694  -1.684    0.092
# polynomial plot with test data
# Prediction grid over the observed age range, with +/- 2 SE bands.
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds_poly <- predict(fit, newdata=list(age=age.grid), se=TRUE)
se.bands <- cbind(preds_poly$fit + 2 * preds_poly$se.fit, 
                  preds_poly$fit - 2 * preds_poly$se.fit)

options(repr.plot.width=12, repr.plot.height=6)

# Scatter of all observations with the fitted curve and SE bands overlaid.
plot(age, wage, xlim=agelims, cex=.5, col="darkgrey", xlab = "age", ylab = "wage")
title("Degree-9 Polynomial")
lines(age.grid, preds_poly$fit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)

# Test-set MSE of the degree-9 polynomial.
print(paste0("the mse is " , mean((predict(fit, data.frame(age = Wage.test$age)) - Wage.test$wage)^2)))
## [1] "the mse is 1598.00616865424"

The MSE of the degree-9 polynomial is 1598. I used 10-fold CV to choose the polynomial degree with the smallest CV error; the selected degree is 9. The plot is based on all of the data. The confidence band is clearly visible over the whole range of age, and is especially wide at the two ends.

# b). step function: regress wage on age discretized into 7 intervals.
# NOTE(review): with `cut(age, 7)` in the formula, predict() re-evaluates the
# cut on the new data's range; here age.grid spans the same range as `age`,
# so the intervals match, but explicit breaks would be safer -- confirm.
fit <- lm(wage ~ cut(age, 7), data=Wage, subset = train)
round(coef(summary(fit)), 3)
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)              80.177      3.423  23.424    0.000
## cut(age, 7)(26.9,35.7]   24.950      4.100   6.085    0.000
## cut(age, 7)(35.7,44.6]   37.189      3.995   9.308    0.000
## cut(age, 7)(44.6,53.4]   38.083      3.976   9.577    0.000
## cut(age, 7)(53.4,62.3]   43.636      4.338  10.059    0.000
## cut(age, 7)(62.3,71.1]   29.636      7.132   4.155    0.000
## cut(age, 7)(71.1,80.1]   26.410     12.555   2.103    0.036
# Prediction grid over the observed age range, with +/- 2 SE bands.
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds_cut <- predict(fit, newdata=list(age=age.grid), se=TRUE)
se.bands <- cbind(preds_cut$fit + 2 * preds_cut$se.fit, 
                  preds_cut$fit - 2 * preds_cut$se.fit)

options(repr.plot.width=12, repr.plot.height=6)

# Scatter of all observations with the step-function fit and SE bands.
plot(age, wage, xlim=agelims, cex=.5, col="darkgrey", xlab = "age", ylab = "wage")
title("7-cut stepwise function")
lines(age.grid, preds_cut$fit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)

# Test-set MSE of the step function.
print(paste0("the mse is " , mean((predict(fit, data.frame(age = Wage.test$age)) - Wage.test$wage)^2)))
## [1] "the mse is 1620.41538330396"

The MSE for the step function is 1620.4153. I used 7 cuts in this case. The confidence band widens once age exceeds about 60. Compared with the polynomial fit, the step function has a tighter confidence band at small ages.

# c). piecewise polynomial: separate cubic fits on four age intervals split
# at knots 33.5, 49, 64.5 (no continuity constraints across knots).
# Evaluation grids, one per interval.
grid1 <- seq(from = agelims[1], to = 33.5)
grid2 <- seq(from = 33.5, to = 49)
grid3 <- seq(from = 49, to = 64.5)
grid4 <- seq(from = 64.5, to = agelims[2])

# Fit a cubic polynomial on each interval of the training data and compute
# pointwise +/- 2 SE bands.
pred1 <- predict(lm(wage ~ poly(age, 3), data = Wage.train[Wage.train$age < 33.5,]), newdata = list(age = grid1), se = TRUE)
bands1 <- cbind(pred1$fit + 2* pred1$se.fit, 
                  pred1$fit - 2 * pred1$se.fit)
pred2 <- predict(lm(wage ~ poly(age, 3), data = Wage.train[Wage.train$age > 33.5 & Wage.train$age <= 49,]), newdata = list(age = grid2), se = TRUE)
bands2 <- cbind(pred2$fit + 2* pred2$se.fit, 
                  pred2$fit - 2 * pred2$se.fit)
pred3 <- predict(lm(wage ~ poly(age, 3), data = Wage.train[Wage.train$age > 49 & Wage.train$age <= 64.5,]), newdata = list(age = grid3), se = TRUE)
bands3 <- cbind(pred3$fit + 2* pred3$se.fit, 
                  pred3$fit - 2 * pred3$se.fit)
pred4 <- predict(lm(wage ~ poly(age, 3), data = Wage.train[Wage.train$age > 64.5,]), newdata = list(age = grid4), se = TRUE)
bands4 <- cbind(pred4$fit + 2* pred4$se.fit, 
                  pred4$fit - 2 * pred4$se.fit)

plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey", xlab = "age", ylab = "wage", main = "piecewise function")
# BUG FIX: the original passed `ity = "dashed"` -- a typo for `lty` -- so
# matlines() warned about an unknown graphical parameter and drew the bands
# with the default solid line type. `lty = "dashed"` draws them as intended.
matlines(grid1, bands1, lwd=1, col="blue", lty = "dashed")
matlines(grid2, bands2, lwd=1, col="blue", lty = "dashed")
matlines(grid3, bands3, lwd=1, col="blue", lty = "dashed")
matlines(grid4, bands4, lwd=1, col="blue", lty = "dashed")
lines(grid1, pred1$fit, lwd=2, col="blue")
lines(grid2, pred2$fit, lwd=2, col="blue")
lines(grid3, pred3$fit, lwd=2, col="blue")
lines(grid4, pred4$fit, lwd=2, col="blue")

I fit the piecewise function with three knots (33.5, 49, 64.5), giving four intervals, each with a degree-3 polynomial. The fitted function is not continuous at the knots. The confidence band widens on the fourth interval, where age is greater than 64.5.

# d). cubic regression spline with knots at ages 25, 40, 60, fit on the
# training half via a B-spline basis.
library(splines)
fit <- lm(wage ~ bs(age, knots = c(25, 40, 60)), data = Wage, subset = train)
pred_bs <- predict(fit, newdata = list(age = age.grid), se = T)

summary(fit)
## 
## Call:
## lm(formula = wage ~ bs(age, knots = c(25, 40, 60)), data = Wage, 
##     subset = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.088 -24.749  -4.806  14.801 199.796 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      61.0395    15.1852   4.020 6.12e-05 ***
## bs(age, knots = c(25, 40, 60))1   0.9202    19.6111   0.047  0.96258    
## bs(age, knots = c(25, 40, 60))2  49.2776    15.1583   3.251  0.00118 ** 
## bs(age, knots = c(25, 40, 60))3  54.6686    17.0136   3.213  0.00134 ** 
## bs(age, knots = c(25, 40, 60))4  66.9080    16.5526   4.042 5.57e-05 ***
## bs(age, knots = c(25, 40, 60))5  45.7181    21.8860   2.089  0.03688 *  
## bs(age, knots = c(25, 40, 60))6  31.0702    27.7077   1.121  0.26232    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.94 on 1493 degrees of freedom
## Multiple R-squared:  0.08677,    Adjusted R-squared:  0.0831 
## F-statistic: 23.64 on 6 and 1493 DF,  p-value: < 2.2e-16
# Test-set MSE of the cubic spline.
print(paste0("the mse is " , mean((predict(fit, data.frame(age = Wage.test$age)) - Wage.test$wage)^2)))
## [1] "the mse is 1600.354279369"
par(mfrow=c(1, 2))
# Fitted spline with +/- 2 SE bands; vertical dashed lines mark the knots.
# NOTE(review): `pred_bs$se` works only via $ partial matching of `se.fit`;
# the explicit name would be safer.
plot(age, wage, col="gray", main="Cubic regression spline", cex=.5, xlab = "age", ylab = "wage")
lines(age.grid, pred_bs$fit, lwd=2)
lines(age.grid, pred_bs$fit + 2 * pred_bs$se, lty="dashed")
lines(age.grid, pred_bs$fit - 2 * pred_bs$se, lty="dashed")
abline(v=25, lty="dashed", col="blue")
abline(v=40, lty="dashed", col="blue")
abline(v=60, lty="dashed", col="blue")

For the cubic spline, I used knots at (25, 40, 60). The confidence bands widen at both ends of the age range as well. The MSE is 1600.3543.

# e). smoothing spline on the training data.
# `cv = TRUE` selects the smoothing level by leave-one-out CV; the warning
# below is expected because many training ages repeat.
fit <- smooth.spline(Wage.train$age, Wage.train$wage, cv = TRUE)
## Warning in smooth.spline(Wage.train$age, Wage.train$wage, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
# A second smoothing spline with the roughness fixed at 5 degrees of freedom.
fit2 <- smooth.spline(Wage.train$age, Wage.train$wage, df = 5)
summary(fit)
##            Length Class             Mode   
## x          61     -none-            numeric
## y          61     -none-            numeric
## w          61     -none-            numeric
## yin        61     -none-            numeric
## tol         1     -none-            numeric
## data        3     -none-            list   
## no.weights  1     -none-            logical
## n           1     -none-            numeric
## lev        61     -none-            numeric
## cv          1     -none-            logical
## cv.crit     1     -none-            numeric
## pen.crit    1     -none-            numeric
## crit        1     -none-            numeric
## df          1     -none-            numeric
## spar        1     -none-            numeric
## ratio       1     -none-            numeric
## lambda      1     -none-            numeric
## iparms      5     -none-            numeric
## auxM        0     -none-            NULL   
## fit         5     smooth.spline.fit list   
## call        4     -none-            call
# All observations with both fits: blue = CV-chosen smoothing, red = df = 5.
plot(age, wage, xlim = agelims, cex=.5, col="darkgrey", main="Smoothing Spline", xlab = "age", ylab = "wage")
lines(fit, col = "blue", lwd = 2)
lines(fit2, col = "red", lwd = 2)

I chose df = 5 and fit the smoothing spline on the training data. The red line is the df = 5 fit and the blue line is the fit with the smoothing level chosen by cross-validation. The two lines match each other very well, much more closely than in the previous models. I think this model yields the best result on the test set.

Problem 5.

# Auto data: drop the non-numeric `name` column before modeling.
df <- Auto[, setdiff(names(Auto), "name")]
# Preview the first few rows of the remaining predictors and response.
head(df)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
a).
set.seed(1)
# 50/50 train-test split of the Auto data; hold out the test-set response.
train <- sample(1: nrow(df), nrow(df) / 2)
auto.test <- df[-train, "mpg"]

# Grow a large regression tree (mindev = 0 disables the deviance-based
# stopping rule, so the tree grows until node-size limits).
# NOTE(review): this assumes the `tree` package is already loaded --
# library(tree) is not visible in this chunk; confirm.
tree.auto <- tree(mpg ~ ., df, subset = train, control = tree.control(nobs = length(train), mindev = 0))
summary(tree.auto)
## 
## Regression tree:
## tree(formula = mpg ~ ., data = df, subset = train, control = tree.control(nobs = length(train), 
##     mindev = 0))
## Variables actually used in tree construction:
## [1] "displacement" "horsepower"   "year"         "acceleration" "weight"      
## Number of terminal nodes:  32 
## Residual mean deviance:  6.199 = 1017 / 164 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -7.16700 -1.18800 -0.04833  0.00000  0.96790 14.13000
#big tree
plot(tree.auto)
text(tree.auto, pretty = 0)

#pruned tree
prune.auto <- prune.tree(tree.auto, best = 7) # pruned tree
plot(prune.auto)
text(prune.auto , pretty = 0)

# Predictions of the unpruned tree on the test set, against the truth.
tree.pred <- predict(tree.auto, newdata = df[-train,]) 
plot(tree.pred, auto.test, xlab = "prediction", ylab = "value in test set")
abline(0,1)

mean((tree.pred - auto.test)^2) #the test MSE for unpruned tree
## [1] 9.366727
# Test MSE for pruned trees of sizes 2 through 10.
for (i in 2:10) {
  prune.auto <- prune.tree(tree.auto, best = i)
  prune.pred <- predict(prune.auto, newdata = df[-train,]) 
  MSE <- round(mean((prune.pred - auto.test)^2), 4)
  print(paste0("The MSE is ", MSE, " when best is ", i)) #the test MSE for pruned tree
}
## [1] "The MSE is 26.0513 when best is 2"
## [1] "The MSE is 20.0664 when best is 3"
## [1] "The MSE is 18.7148 when best is 4"
## [1] "The MSE is 16.6175 when best is 5"
## [1] "The MSE is 13.306 when best is 6"
## [1] "The MSE is 12.8271 when best is 7"
## [1] "The MSE is 12.6116 when best is 8"
## [1] "The MSE is 11.578 when best is 9"
## [1] "The MSE is 11.6808 when best is 10"

The MSE for the unpruned tree is 9.3667. The MSEs for pruned trees of sizes 2-10 are 26.0513, 20.0664, 18.7148, 16.6175, 13.306, 12.8271, 12.6116, 11.578, and 11.6808 respectively, all higher than that of the unpruned tree. Among the pruned trees, the best size is 9.

b).
set.seed(1)
# Bagging = random forest with mtry equal to the number of predictors (7),
# so every split considers all predictors.
# NOTE(review): assumes the randomForest package is already loaded --
# library(randomForest) is not visible in this chunk; confirm.
bag.auto <- randomForest(mpg ~ ., data = df, subset = train, mtry = 7, importance = TRUE)
bag.auto
## 
## Call:
##  randomForest(formula = mpg ~ ., data = df, mtry = 7, importance = TRUE,      subset = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##           Mean of squared residuals: 9.481201
##                     % Var explained: 85.59
# Test-set predictions of the bagged model, against the truth.
bag.pred <- predict(bag.auto, newdata = df[-train,]) 
plot(bag.pred, auto.test, xlab = "bagging prediction", ylab = "value in test set")
abline(0, 1)

# Test-set MSE for bagging.
mean((bag.pred - auto.test)^2) 
## [1] 7.023526

The MSE for bagging is 7.0235. The tuning parameter is m = p = 7, i.e. all predictors are considered at every split of each regression tree.

c).
set.seed(1)
# Random forest for regression: use m = floor(p/3) = floor(7/3) = 2 candidate
# predictors per split. FIX: the original passed `mtry = 7/3`, a non-integer
# that randomForest truncates internally; making the floor explicit states
# the intent without changing the fit. `importance = T` is also spelled out
# as TRUE (T is reassignable).
rf.auto <- randomForest(mpg ~ ., data = df, 
                         subset = train, mtry = floor(7 / 3), importance = TRUE)
# Test-set predictions and MSE for the random forest.
yhat.rf <- predict(rf.auto, newdata = df[-train, ])
mean((yhat.rf - auto.test)^2)
## [1] 6.958561
# Variable-importance plot (%IncMSE and IncNodePurity).
varImpPlot(rf.auto)

plot(yhat.rf, auto.test, xlab = "rf prediction", ylab = "value in test set")
abline(0, 1)

The MSE is 6.9586. The tuning parameter is m = floor(p/3) = floor(7/3) = 2, the usual choice for random forests in regression.

d).
library(gam)
## Loading required package: foreach
## Loaded gam 1.20.1
# GAM with smoothing-spline terms for the six quantitative predictors;
# origin enters as a linear (parametric) term.
# NOTE(review): cylinders takes only a handful of distinct values, so a
# smooth term s(cylinders) may be questionable -- confirm.
gam.m <- gam(mpg ~ s(cylinders) + s(displacement) + s(horsepower) + s(weight) + s(acceleration) + s(year) + origin, data = df, subset = train)
summary(gam.m)
## 
## Call: gam(formula = mpg ~ s(cylinders) + s(displacement) + s(horsepower) + 
##     s(weight) + s(acceleration) + s(year) + origin, data = df, 
##     subset = train)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8945 -1.5559  0.1209  1.3163 11.8406 
## 
## (Dispersion Parameter for gaussian family taken to be 7.6361)
## 
##     Null Deviance: 12892.19 on 195 degrees of freedom
## Residual Deviance: 1298.132 on 169.9998 degrees of freedom
## AIC: 980.7754 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                  Df Sum Sq Mean Sq   F value    Pr(>F)    
## s(cylinders)      1 8112.0  8112.0 1062.3291 < 2.2e-16 ***
## s(displacement)   1  829.5   829.5  108.6311 < 2.2e-16 ***
## s(horsepower)     1  163.5   163.5   21.4092 7.327e-06 ***
## s(weight)         1  140.6   140.6   18.4188 2.970e-05 ***
## s(acceleration)   1   62.3    62.3    8.1560  0.004827 ** 
## s(year)           1 1342.3  1342.3  175.7797 < 2.2e-16 ***
## origin            1   37.0    37.0    4.8393  0.029167 *  
## Residuals       170 1298.1     7.6                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                 Npar Df  Npar F     Pr(F)    
## (Intercept)                                  
## s(cylinders)          3  8.8247 1.805e-05 ***
## s(displacement)       3  2.6158 0.0527740 .  
## s(horsepower)         3  1.7774 0.1533438    
## s(weight)             3 10.2654 3.014e-06 ***
## s(acceleration)       3  8.6317 2.299e-05 ***
## s(year)               3  7.4170 0.0001068 ***
## origin                                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1, 3))
# Partial-effect plots for each term, with standard-error bands.
plot(gam.m, se=TRUE, col="blue")

# Test-set MSE of the GAM.
yhat.gam <- predict(gam.m, newdata = df[-train, ])
mean((yhat.gam - auto.test)^2)
## [1] 6.470187

The MSE is 6.47. The p-values in the "Anova for Parametric Effects" table show that all predictors are highly statistically significant even when a linear relationship is assumed. In the "Anova for Nonparametric Effects" table, the large p-values for displacement and horsepower suggest that a linear function is adequate for these two variables.

e).

I prefer the regression tree model. In terms of accuracy, its MSE of 9.3667 is not too far from that of the GAM, which is 6.47. I believe the regression tree is the easiest model for all audiences to interpret, since it simply splits the data into two groups repeatedly to find the best prediction rule.